{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "2d202140",
   "metadata": {},
   "source": [
    "# Example of using sentence splitter chunking\n",
    "Compare the diff of splitting_1.txt and splitting_2.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a23c1a8-71ea-4b6d-ae42-5c1cf4014dff",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.text_splitter import TokenTextSplitter\n",
    "from llama_index import SimpleDirectoryReader, Document\n",
    "from llama_index.utils import globals_helper\n",
    "from langchain.text_splitter import (\n",
    "    NLTKTextSplitter,\n",
    "    SpacyTextSplitter,\n",
    "    RecursiveCharacterTextSplitter,\n",
    ")\n",
    "\n",
    "document = SimpleDirectoryReader(\"data\").load_data()[0]\n",
    "text_splitter_default = TokenTextSplitter()  # use default settings\n",
    "text_chunks = text_splitter_default.split_text(document.text)\n",
    "doc_chunks = [Document(text=t) for t in text_chunks]\n",
    "tokenizer = globals_helper.tokenizer\n",
    "with open(\"splitting_1.txt\", \"w\") as f:\n",
    "    for idx, doc in enumerate(doc_chunks):\n",
    "        f.write(\n",
    "            \"\\n-------\\n\\n{}. Size: {} tokens\\n\".format(idx, len(tokenizer(doc.text)))\n",
    "            + doc.text\n",
    "        )\n",
    "\n",
    "from llama_index.text_splitter import SentenceSplitter\n",
    "\n",
    "sentence_splitter = SentenceSplitter()\n",
    "text_chunks = sentence_splitter.split_text(document.text)\n",
    "doc_chunks = [Document(text=t) for t in text_chunks]\n",
    "with open(\"splitting_2.txt\", \"w\") as f:\n",
    "    for idx, doc in enumerate(doc_chunks):\n",
    "        f.write(\n",
    "            \"\\n-------\\n\\n{}. Size: {} tokens\\n\".format(idx, len(tokenizer(doc.text)))\n",
    "            + doc.text\n",
    "        )\n",
    "\n",
    "nltk_splitter = NLTKTextSplitter()\n",
    "text_chunks = nltk_splitter.split_text(document.text)\n",
    "doc_chunks = [Document(text=t) for t in text_chunks]\n",
    "tokenizer = globals_helper.tokenizer\n",
    "with open(\"splitting_3.txt\", \"w\") as f:\n",
    "    for idx, doc in enumerate(doc_chunks):\n",
    "        f.write(\n",
    "            \"\\n-------\\n\\n{}. Size: {} tokens\\n\".format(idx, len(tokenizer(doc.text)))\n",
    "            + doc.text\n",
    "        )\n",
    "\n",
    "# spacy_splitter = SpacyTextSplitter()\n",
    "# text_chunks = spacy_splitter.split_text(document.text)\n",
    "# tokenizer = globals_helper.tokenizer\n",
    "# with open('splitting_4.txt', 'w') as f:\n",
    "#     for idx, doc in enumerate(doc_chunks):\n",
    "#         f.write(\"\\n-------\\n\\n{}. Size: {} tokens\\n\".format(idx, len(tokenizer(doc.text))) + doc.text)\n",
    "\n",
    "# from langchain.text_splitter import TokenTextSplitter\n",
    "# token_text_splitter = TokenTextSplitter()\n",
    "# text_chunks = token_text_splitter.split_text(document.text)\n",
    "# doc_chunks = [Document(text=t) for t in text_chunks]\n",
    "# tokenizer = globals_helper.tokenizer\n",
    "# with open('splitting_5.txt', 'w') as f:\n",
    "#     for idx, doc in enumerate(doc_chunks):\n",
    "#         f.write(\"\\n-------\\n\\n{}. Size: {} tokens\\n\".format(idx, len(tokenizer(doc.text))) + doc.text)\n",
    "\n",
    "# recursive_splitter = RecursiveCharacterTextSplitter()\n",
    "# text_chunks = recursive_splitter.split_text(document.text)\n",
    "# doc_chunks = [Document(text=t) for t in text_chunks]\n",
    "# tokenizer = globals_helper.tokenizer\n",
    "# with open('splitting_6.txt', 'w') as f:\n",
    "#     for idx, doc in enumerate(doc_chunks):\n",
    "#         f.write(\"\\n-------\\n\\n{}. Size: {} tokens\\n\".format(idx, len(tokenizer(doc.text))) + doc.text)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "7e62ef7d",
   "metadata": {},
   "source": [
    "## Testing with Chinese"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44711ded",
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.text_splitter import SentenceSplitter\n",
    "from llama_index.schema import Document\n",
    "from llama_index.indices.service_context import ServiceContext\n",
    "from llama_index.node_parser.simple import SimpleNodeParser\n",
    "from llama_index.indices.vector_store import VectorStoreIndex\n",
    "import wikipedia"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e1262b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentence_splitter = SentenceSplitter()\n",
    "wikipedia.set_lang(\"zh\")\n",
    "page = wikipedia.page(\"美国\", auto_suggest=True).content\n",
    "sentence_splitter.split_text(page)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78dc563c",
   "metadata": {},
   "outputs": [],
   "source": [
    "node_parser = SimpleNodeParser.from_defaults(text_splitter=sentence_splitter)\n",
    "service_context = ServiceContext.from_defaults(node_parser=node_parser)\n",
    "documents = []\n",
    "documents.append(Document(text=page))\n",
    "index = VectorStoreIndex.from_documents(documents, service_context=service_context)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
